In [914]:
!pip install arch
Requirement already satisfied: arch in /usr/local/lib/python3.7/dist-packages (5.1.0)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from arch) (1.19.5)
Requirement already satisfied: property-cached>=1.6.4 in /usr/local/lib/python3.7/dist-packages (from arch) (1.6.4)
Requirement already satisfied: statsmodels>=0.11 in /usr/local/lib/python3.7/dist-packages (from arch) (0.13.1)
Requirement already satisfied: scipy>=1.3 in /usr/local/lib/python3.7/dist-packages (from arch) (1.4.1)
Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.7/dist-packages (from arch) (1.1.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->arch) (1.15.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.11->arch) (0.5.2)
In [915]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from scipy import stats

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import ipywidgets as widgets
from IPython.display import display
In [1151]:
# Stock selector: the cells below load whichever ticker is chosen here.
# NOTE(review): downstream results depend on this manual widget interaction,
# so a plain Restart-and-Run-All will use the default 'SELECT' value.
w = widgets.Dropdown(
    options=['SELECT', 'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
             'FB', 'GME', 'MCD', 'PFE', 'PLUG',
             'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description='Stock name:',
)

def on_change(change):
    """Echo the newly selected ticker whenever the dropdown value changes."""
    # observe() without names=... fires for every trait event, so filter
    # down to actual value changes.
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

w.observe(on_change)
display(w)
You have selected ARDS
In [1152]:
# Load Final_<TICKER>.csv for the selected dropdown value.
# Replaces 15 copy-pasted if-blocks with one parameterized read; behavior is
# unchanged: nothing is loaded while the dropdown still shows 'SELECT'.
# NOTE(review): the hard-coded /content/ path assumes Google Colab — confirm.
VALID_TICKERS = {'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
                 'FB', 'GME', 'MCD', 'PFE', 'PLUG',
                 'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU'}
if w.value in VALID_TICKERS:
    df = pd.read_csv(f'/content/Final_{w.value}.csv')
In [1153]:
# Never truncate cell contents when rendering DataFrames.
pd.set_option('display.max_colwidth', None)
In [1154]:
# Parse the Date column into datetime64[ns]; pd.to_datetime is the idiomatic
# converter and gives clearer errors on malformed dates than .astype().
df['Date'] = pd.to_datetime(df['Date'])
In [1155]:
# Drop the leftover index column produced when the CSV was saved with its index.
del df['Unnamed: 0']
In [1156]:
# Quick look at the first rows to verify the file loaded as expected.
df.head(5)
Out[1156]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet
0 2019-04-03 10.83 10.83 10.83 10.83 10.83 300 -2.432436 0.544830 0.170220 1.247500 11.758013 10.107701 10.932857 NaN 11.518925 0.270000 34.386176 NaN NaN NaN 0.230000 NaN 0.021698 56.349816 NaN NaN 41.886798 49.889543 -637.011844 -6177.258315 73400.0 0.0 636.0 0.0 0.0 0.0 0.0 0.0 636.0 636.0 636.0 636.0 0.0 0 2 2 2 2 2 0 2
1 2019-04-05 11.00 11.30 10.60 11.30 11.30 1900 4.243543 0.494302 0.087996 1.047143 11.547568 10.361004 10.954286 NaN 9.266748 0.700000 35.342920 NaN NaN NaN -0.290000 NaN -0.025022 59.107076 NaN NaN 31.264471 36.107122 2262.988156 -4182.057340 76300.0 0.0 11437.0 0.0 0.0 0.0 0.0 0.0 11437.0 11437.0 11437.0 11437.0 0.0 0 3 3 3 3 3 0 3
2 2019-04-08 11.19 11.33 11.19 11.33 11.33 600 0.265484 0.326664 0.073853 0.917551 11.607804 10.520767 11.064286 0.858333 8.098418 0.140000 36.185132 NaN NaN NaN 0.700000 8.448162 0.065851 59.284135 NaN NaN 33.000473 33.145014 2862.988156 -3052.045370 76900.0 0.0 100.0 0.0 0.0 0.0 0.0 0.0 100.0 100.0 100.0 100.0 0.0 0 1 1 1 1 1 0 1
3 2019-04-09 11.00 11.00 10.02 10.23 10.23 4300 -9.708741 0.200292 0.129249 0.973615 11.616168 10.178118 10.897143 0.691923 9.517253 1.309999 14.493114 NaN NaN NaN -0.170000 6.774873 -0.016346 50.628027 NaN NaN 33.739911 32.668285 405.838125 -3094.129541 72600.0 0.0 2798.0 0.0 0.0 0.0 0.0 0.0 2798.0 2798.0 2798.0 2798.0 0.0 0 1 1 1 1 1 0 1
4 2019-04-10 10.36 10.45 10.15 10.15 10.15 6100 -0.782013 0.259392 0.195168 0.877384 11.709270 9.942159 10.825714 0.597308 8.644180 0.300000 14.493114 NaN NaN NaN -1.440001 5.817349 -0.124245 50.055603 NaN NaN 23.972268 30.237551 -5694.161875 -4770.970379 66500.0 0.0 16702.0 0.0 0.0 0.0 0.0 0.0 16702.0 16702.0 16702.0 16702.0 0.0 0 5 5 5 5 5 0 5
In [1157]:
# Column overview: dtypes and non-null counts (several technical indicators
# still contain NaNs at this point, e.g. MACD, TRIX, ULTOSC).
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 513 entries, 0 to 512
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       513 non-null    datetime64[ns]
 1   Open                       513 non-null    float64       
 2   High                       513 non-null    float64       
 3   Low                        513 non-null    float64       
 4   Close                      513 non-null    float64       
 5   Adj Close                  513 non-null    float64       
 6   Volume                     513 non-null    int64         
 7   Return                     513 non-null    float64       
 8   Beta                       513 non-null    float64       
 9   Variance                   513 non-null    float64       
 10  AvgTrueRange               513 non-null    float64       
 11  Upperband                  513 non-null    float64       
 12  Lowerband                  513 non-null    float64       
 13  Middleband                 513 non-null    float64       
 14  APO                        511 non-null    float64       
 15  NATR                       513 non-null    float64       
 16  TRANGE                     513 non-null    float64       
 17  DMI                        513 non-null    float64       
 18  MACD                       503 non-null    float64       
 19  MACDSIGNAL                 503 non-null    float64       
 20  MACDHIST                   503 non-null    float64       
 21  MOM                        513 non-null    float64       
 22  PPO                        511 non-null    float64       
 23  ROCP                       513 non-null    float64       
 24  RSI                        513 non-null    float64       
 25  TRIX                       468 non-null    float64       
 26  ULTOSC                     508 non-null    float64       
 27  SLOWK                      513 non-null    float64       
 28  SLOWD                      513 non-null    float64       
 29  AD                         513 non-null    float64       
 30  ADOSC                      513 non-null    float64       
 31  OBV                        513 non-null    float64       
 32  Upward_momentum_created    513 non-null    float64       
 33  Downward_momentum_created  513 non-null    float64       
 34  B5_O_Um                    513 non-null    float64       
 35  B5_C_Um                    513 non-null    float64       
 36  B5_E_Um                    513 non-null    float64       
 37  B5_A_Um                    513 non-null    float64       
 38  B5_N_Um                    513 non-null    float64       
 39  B5_O_Dm                    513 non-null    float64       
 40  B5_C_Dm                    513 non-null    float64       
 41  B5_E_Dm                    513 non-null    float64       
 42  B5_A_Dm                    513 non-null    float64       
 43  B5_N_Dm                    513 non-null    float64       
 44  Verified_status_True       513 non-null    int64         
 45  Verified_status_False      513 non-null    int64         
 46  O                          513 non-null    int64         
 47  C                          513 non-null    int64         
 48  E                          513 non-null    int64         
 49  A                          513 non-null    int64         
 50  N                          513 non-null    int64         
 51  Real_or_Fake_tweet         513 non-null    int64         
dtypes: datetime64[ns](1), float64(42), int64(9)
memory usage: 208.5 KB
In [1158]:
# (rows, columns) sanity check.
df.shape
Out[1158]:
(513, 52)
In [1159]:
# Base seaborn font scale (effectively overridden by the set_context call
# in the next cell).
sns.set(font_scale=0.8)
In [1160]:
# CHANGE CONTEXT TO poster TO INCREASE FONT SIZES
sns.set_context("talk", font_scale=1.3)

# PLOT THE SELECTED STOCK'S CLOSING PRICES
# (fixed: the original comment said "BTC-USE'S ... SINCE 2014", which does not
# describe this data — it is the selected ticker's daily closing price series)
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(18, 8))
    sns.lineplot(x=df.Date, y=df.Close, color='blue', ax=ax)
    ax.set_title(f'{w.value} Closing Price')
    ax.set_xlabel('Date')
    ax.set_ylabel('Close')
In [1161]:
# CALCULATE PRICE RETURNS AS DAILY PERCENTAGE CHANGE USING pct_change()
# (fixed: the trailing .dropna() was a misleading no-op — index alignment on
# column assignment re-inserts NaN for the first row anyway; that row is
# dropped explicitly a few cells below)
df['returns'] = 100 * df.Close.pct_change()
In [1162]:
# CALCULATE LOG RETURNS: ln(P_t / P_{t-1}); the first row is NaN by construction
df['log_returns'] = np.log(df['Close'] / df['Close'].shift(1))
In [1163]:
# Confirm the new returns / log_returns columns (first row is NaN).
df.head()
Out[1163]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns
0 2019-04-03 10.83 10.83 10.83 10.83 10.83 300 -2.432436 0.544830 0.170220 1.247500 11.758013 10.107701 10.932857 NaN 11.518925 0.270000 34.386176 NaN NaN NaN 0.230000 NaN 0.021698 56.349816 NaN NaN 41.886798 49.889543 -637.011844 -6177.258315 73400.0 0.0 636.0 0.0 0.0 0.0 0.0 0.0 636.0 636.0 636.0 636.0 0.0 0 2 2 2 2 2 0 2 NaN NaN
1 2019-04-05 11.00 11.30 10.60 11.30 11.30 1900 4.243543 0.494302 0.087996 1.047143 11.547568 10.361004 10.954286 NaN 9.266748 0.700000 35.342920 NaN NaN NaN -0.290000 NaN -0.025022 59.107076 NaN NaN 31.264471 36.107122 2262.988156 -4182.057340 76300.0 0.0 11437.0 0.0 0.0 0.0 0.0 0.0 11437.0 11437.0 11437.0 11437.0 0.0 0 3 3 3 3 3 0 3 4.339799 0.042483
2 2019-04-08 11.19 11.33 11.19 11.33 11.33 600 0.265484 0.326664 0.073853 0.917551 11.607804 10.520767 11.064286 0.858333 8.098418 0.140000 36.185132 NaN NaN NaN 0.700000 8.448162 0.065851 59.284135 NaN NaN 33.000473 33.145014 2862.988156 -3052.045370 76900.0 0.0 100.0 0.0 0.0 0.0 0.0 0.0 100.0 100.0 100.0 100.0 0.0 0 1 1 1 1 1 0 1 0.265484 0.002651
3 2019-04-09 11.00 11.00 10.02 10.23 10.23 4300 -9.708741 0.200292 0.129249 0.973615 11.616168 10.178118 10.897143 0.691923 9.517253 1.309999 14.493114 NaN NaN NaN -0.170000 6.774873 -0.016346 50.628027 NaN NaN 33.739911 32.668285 405.838125 -3094.129541 72600.0 0.0 2798.0 0.0 0.0 0.0 0.0 0.0 2798.0 2798.0 2798.0 2798.0 0.0 0 1 1 1 1 1 0 1 -9.708741 -0.102130
4 2019-04-10 10.36 10.45 10.15 10.15 10.15 6100 -0.782013 0.259392 0.195168 0.877384 11.709270 9.942159 10.825714 0.597308 8.644180 0.300000 14.493114 NaN NaN NaN -1.440001 5.817349 -0.124245 50.055603 NaN NaN 23.972268 30.237551 -5694.161875 -4770.970379 66500.0 0.0 16702.0 0.0 0.0 0.0 0.0 0.0 16702.0 16702.0 16702.0 16702.0 0.0 0 5 5 5 5 5 0 5 -0.782013 -0.007851
In [1164]:
# DROPPING THE 1ST ROW OF DATA 
# BECAUSE I SHIFTED IT FORWARD TO CALCULATE RETURNS/LOG RETURNS
# NOTE(review): this also drops every earlier row with NaN technical
# indicators (513 -> 468 rows per df.info() before/after), not just row 1.
df.dropna(inplace=True)
In [1165]:
# PLOT DISTRIBUTION PLOTS OF RETURNS & LOG RETURNS
# AND VISUALLY COMPARE THEM WITH THE STANDARD NORMAL DISTRIBUTION
with sns.axes_style("darkgrid"):
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18,12))

    # (series, color, panel title, grid row) for the two return definitions
    panels = [(df.returns, 'blue', 'Returns', 0),
              (df.log_returns, 'green', 'Log Returns', 1)]

    for series, color, title, row in panels:
        # Left column: raw time series
        axes[row][0].plot(series, color=color)
        axes[row][0].set_title(title)

        # Right column: histogram with a fitted normal overlay
        sns.distplot(series, norm_hist=True, fit=stats.norm, color=color,
                     bins=50, ax=axes[row][1])
        axes[row][1].set_title(title)

    plt.tight_layout()
    fig.show();
In [1166]:
# HELPER: COMPUTE REALIZED VOLATILITY
# FROM DAILY LOG RETURNS
def realized_volatility_daily(series_log_return):
    """
    Realized volatility of a window of daily log returns: the square root
    of the sum of squared log returns divided by (n - 1), where n is the
    number of observations in the window.
    """
    sum_of_squares = np.sum(np.square(series_log_return))
    return np.sqrt(sum_of_squares / (len(series_log_return) - 1))
In [1167]:
intervals = [7, 30, 60, 180, 365]

# Rolling realized volatility for each window size, keyed by window length
# and aligned to df's index.
vols_df = pd.DataFrame(
    {interval: df.log_returns.rolling(window=interval)
                             .apply(realized_volatility_daily)
                             .values
     for interval in intervals},
    columns=intervals,
    index=df.index,
)
In [1168]:
# CHANGING MATPLOTLIB STYLE
plt.style.use(['fivethirtyeight'])

fig, ax = plt.subplots(figsize=(18,7))

for i in intervals:
    # De-emphasize the noisy 7-day series relative to the longer windows.
    is_short_window = (i == 7)
    ax.plot(vols_df[i],
            label=f'{i}-Day Interval Realized Volatility',
            alpha=0.5 if is_short_window else 1.0,
            lw=1 if is_short_window else 2)

ax.set_title('Realized Volatility Using Different Interval Windows', fontsize=21)

plt.legend(loc='best', prop={'size': 14})
plt.show();
In [1169]:
INTERVAL_WINDOW = 30
n_future = 7

def _rolling_realized_vol(series):
    """30-day rolling realized volatility of a log-return series."""
    return series.rolling(window=INTERVAL_WINDOW).apply(realized_volatility_daily)

# BACKWARD-LOOKING (CURRENT) REALIZED VOLATILITY
df['vol_current'] = _rolling_realized_vol(df.log_returns)

# FORWARD-LOOKING REALIZED VOLATILITY (log returns shifted n_future days back)
df['vol_future'] = _rolling_realized_vol(df.log_returns.shift(-n_future))
In [1170]:
# Summary statistics for every numeric column (note the lower counts for
# vol_current / vol_future: NaNs from the rolling windows are still present).
df.describe()
Out[1170]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns vol_current vol_future
count 468.000000 468.000000 468.000000 468.000000 468.000000 4.680000e+02 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 4.680000e+02 468.000000 4.680000e+02 4.680000e+02 4.680000e+02 468.0 4.680000e+02 468.0 468.0 468.0 468.0 468.0 4.680000e+02 4.680000e+02 4.680000e+02 4.680000e+02 468.0 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.0 468.000000 468.000000 468.000000 439.000000 432.000000
mean 6.231966 6.520321 5.948376 6.202714 6.202714 1.332378e+05 0.131768 0.365316 0.164022 0.566142 6.859814 5.604118 6.231966 -0.085735 9.252811 0.607607 35.090860 -0.074624 -0.071995 -0.002629 -0.107265 -1.739377 -0.009020 46.893518 -0.158106 44.877125 4.310474e+01 42.879384 -1.283095e+06 -6.147572e+04 -6.876575e+05 0.0 7.617847e+04 0.0 0.0 0.0 0.0 0.0 7.617847e+04 7.617847e+04 7.617847e+04 7.617847e+04 0.0 0.010684 7.119658 7.130342 7.130342 7.130342 7.130342 0.0 7.130342 -0.079631 -0.002864 0.062072 0.061668
std 1.794156 1.878131 1.719451 1.797703 1.797703 2.029760e+06 6.319855 0.582516 0.368758 0.245625 2.037902 1.585632 1.752478 0.503958 3.376426 0.479898 23.661300 0.296449 0.264274 0.120852 1.113222 7.543412 0.164918 10.856567 0.363290 11.172344 2.326234e+01 21.445443 2.573037e+06 5.640195e+05 2.510702e+06 0.0 3.108380e+05 0.0 0.0 0.0 0.0 0.0 3.108380e+05 3.108380e+05 3.108380e+05 3.108380e+05 0.0 0.138411 41.387998 41.482048 41.482048 41.482048 41.482048 0.0 41.482048 6.502832 0.064116 0.019271 0.019138
min 1.920000 2.120000 1.890000 2.000000 2.000000 0.000000e+00 -22.454810 -1.430488 0.002127 0.180364 2.348566 1.772203 2.170000 -1.883205 3.604446 0.000000 0.023735 -0.807535 -0.764861 -0.402666 -4.600000 -20.646403 -0.388732 19.054480 -0.906122 14.360203 -3.419487e-14 0.249688 -4.166858e+07 -1.158566e+07 -4.915700e+06 0.0 8.000000e+00 0.0 0.0 0.0 0.0 0.0 8.000000e+00 8.000000e+00 8.000000e+00 8.000000e+00 0.0 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.0 1.000000 -24.326841 -0.278747 0.030493 0.030493
25% 5.380000 5.707500 4.915000 5.217500 5.217500 4.100000e+03 -3.603113 0.056009 0.024620 0.390013 5.914356 4.573033 5.180000 -0.373622 6.828162 0.300000 16.237545 -0.246629 -0.226526 -0.058241 -0.660000 -6.416996 -0.107648 39.413086 -0.352046 36.420265 2.472975e+01 25.372604 -6.931572e+05 -1.920576e+04 -2.575250e+05 0.0 4.545756e+03 0.0 0.0 0.0 0.0 0.0 4.545756e+03 4.545756e+03 4.545756e+03 4.545756e+03 0.0 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.0 1.000000 -3.867786 -0.039446 0.046378 0.046303
50% 6.485000 6.820000 6.215000 6.440000 6.440000 1.165000e+04 0.000000 0.294892 0.059216 0.541241 7.207210 5.847653 6.562143 -0.050833 8.547092 0.485000 30.381869 -0.068980 -0.067819 0.007540 -0.100000 -1.048386 -0.018608 47.019483 -0.065763 44.485368 4.017587e+01 41.351320 -4.137615e+05 -4.793660e+03 -4.480000e+04 0.0 1.588800e+04 0.0 0.0 0.0 0.0 0.0 1.588800e+04 1.588800e+04 1.588800e+04 1.588800e+04 0.0 0.000000 3.000000 3.000000 3.000000 3.000000 3.000000 0.0 3.000000 -0.159047 -0.001592 0.055347 0.054839
75% 7.152500 7.462500 6.862500 7.180000 7.180000 3.142500e+04 3.202341 0.576092 0.139392 0.700684 7.731716 6.583447 7.118571 0.201298 10.837856 0.762500 51.761298 0.089954 0.077966 0.053542 0.352500 2.951392 0.058926 54.426653 0.058427 53.009114 6.053474e+01 58.794200 -3.199878e+05 2.768219e+03 1.052750e+05 0.0 4.214803e+04 0.0 0.0 0.0 0.0 0.0 4.214803e+04 4.214803e+04 4.214803e+04 4.214803e+04 0.0 0.000000 5.250000 5.250000 5.250000 5.250000 5.250000 0.0 5.250000 3.118547 0.030709 0.079537 0.077584
max 12.400000 12.400000 11.400000 11.990000 11.990000 4.369460e+07 35.515155 5.013547 3.338192 1.446178 13.872647 10.852795 11.575714 1.629295 26.819372 3.380000 91.369298 0.774456 0.632903 0.497846 4.110000 24.001020 0.703371 74.652083 0.509668 72.848986 9.591839e+01 93.880952 1.069983e+05 1.472361e+05 3.910620e+07 0.0 5.158965e+06 0.0 0.0 0.0 0.0 0.0 5.158965e+06 5.158965e+06 5.158965e+06 5.158965e+06 0.0 2.000000 874.000000 876.000000 876.000000 876.000000 876.000000 0.0 876.000000 35.515155 0.303913 0.101381 0.101381
In [1171]:
# Give the label column a clearer name (avoid inplace=True for chainability).
df = df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
In [1172]:
# Impute remaining NaNs (rolling-window volatility values) with column medians.
# NOTE(review): df.median() skips non-numeric columns here, so Date is
# untouched — confirm this still holds on newer pandas versions.
df = df.fillna(df.median())
In [1173]:
# Confirm the median imputation above left no missing values.
df.isna().sum()
Out[1173]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1174]:
# Re-inspect dtypes / non-null counts after adding the returns and
# volatility columns (now 56 columns, all fully populated).
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 468 entries, 45 to 512
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       468 non-null    datetime64[ns]
 1   Open                       468 non-null    float64       
 2   High                       468 non-null    float64       
 3   Low                        468 non-null    float64       
 4   Close                      468 non-null    float64       
 5   Adj Close                  468 non-null    float64       
 6   Volume                     468 non-null    int64         
 7   Return                     468 non-null    float64       
 8   Beta                       468 non-null    float64       
 9   Variance                   468 non-null    float64       
 10  AvgTrueRange               468 non-null    float64       
 11  Upperband                  468 non-null    float64       
 12  Lowerband                  468 non-null    float64       
 13  Middleband                 468 non-null    float64       
 14  APO                        468 non-null    float64       
 15  NATR                       468 non-null    float64       
 16  TRANGE                     468 non-null    float64       
 17  DMI                        468 non-null    float64       
 18  MACD                       468 non-null    float64       
 19  MACDSIGNAL                 468 non-null    float64       
 20  MACDHIST                   468 non-null    float64       
 21  MOM                        468 non-null    float64       
 22  PPO                        468 non-null    float64       
 23  ROCP                       468 non-null    float64       
 24  RSI                        468 non-null    float64       
 25  TRIX                       468 non-null    float64       
 26  ULTOSC                     468 non-null    float64       
 27  SLOWK                      468 non-null    float64       
 28  SLOWD                      468 non-null    float64       
 29  AD                         468 non-null    float64       
 30  ADOSC                      468 non-null    float64       
 31  OBV                        468 non-null    float64       
 32  Upward_momentum_created    468 non-null    float64       
 33  Downward_momentum_created  468 non-null    float64       
 34  B5_O_Um                    468 non-null    float64       
 35  B5_C_Um                    468 non-null    float64       
 36  B5_E_Um                    468 non-null    float64       
 37  B5_A_Um                    468 non-null    float64       
 38  B5_N_Um                    468 non-null    float64       
 39  B5_O_Dm                    468 non-null    float64       
 40  B5_C_Dm                    468 non-null    float64       
 41  B5_E_Dm                    468 non-null    float64       
 42  B5_A_Dm                    468 non-null    float64       
 43  B5_N_Dm                    468 non-null    float64       
 44  Verified_status_True       468 non-null    int64         
 45  Verified_status_False      468 non-null    int64         
 46  O                          468 non-null    int64         
 47  C                          468 non-null    int64         
 48  E                          468 non-null    int64         
 49  A                          468 non-null    int64         
 50  N                          468 non-null    int64         
 51  Fake_news                  468 non-null    int64         
 52  returns                    468 non-null    float64       
 53  log_returns                468 non-null    float64       
 54  vol_current                468 non-null    float64       
 55  vol_future                 468 non-null    float64       
dtypes: datetime64[ns](1), float64(46), int64(9)
memory usage: 208.4 KB
In [1175]:
# Final frame dimensions after NaN handling: 468 rows x 56 columns.
df.shape
Out[1175]:
(468, 56)
In [1176]:
# NOTE(review): this is a no-op — the fillna(median) cell above already
# removed every NaN (verified by isna().sum()); kept only as a safety net.
df=df.dropna()
In [1177]:
# Final dtype check before correlation / modeling work.
df.dtypes
Out[1177]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1178]:
# Correlation heatmap across all numeric columns.
# (fixed: matplotlib.pyplot and seaborn are already imported in the setup
# cell at the top of the notebook — the duplicate scattered imports that
# were here have been removed)
plt.figure(figsize=(40,15))
sns.heatmap(df.corr(),annot=True)
Out[1178]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0786852f50>
In [1179]:
# Per-column histograms for a quick look at each feature's distribution.
df.hist(figsize=(20, 32), bins=70, xlabelsize=8, ylabelsize=8);
In [1180]:
# Features whose correlation with 'AvgTrueRange' exceeds 0.5 in absolute value.
col_corr = df.corr()['AvgTrueRange']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(strong_feats), strong_feats))
There are 14 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
NATR            0.715404
Upperband       0.676749
vol_future      0.664798
TRANGE          0.639400
High            0.626427
vol_current     0.602496
Open            0.583150
Middleband      0.582814
Adj Close       0.562202
Close           0.562202
TRIX            0.556655
Variance        0.533609
Low             0.518880
Name: AvgTrueRange, dtype: float64
In [1181]:
# Features whose correlation with 'NATR' exceeds 0.5 in absolute value.
col_corr = df.corr()['NATR']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(strong_feats), strong_feats))
There are 4 strongly correlated values with NATR :
NATR            1.000000
AvgTrueRange    0.715404
vol_future      0.614154
vol_current     0.542756
Name: NATR, dtype: float64
In [1182]:
# Features whose correlation with 'TRANGE' exceeds 0.5 in absolute value.
col_corr = df.corr()['TRANGE']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(strong_feats), strong_feats))
There are 2 strongly correlated values with TRANGE:
TRANGE          1.0000
AvgTrueRange    0.6394
Name: TRANGE, dtype: float64
In [1183]:
# Features whose correlation with Openness ('O') exceeds 0.5 in absolute value.
col_corr = df.corr()['O']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(strong_feats), strong_feats))
There are 16 strongly correlated values with Openness:
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999997
Volume                       0.985550
B5_A_Dm                      0.879724
B5_E_Dm                      0.879724
B5_C_Dm                      0.879724
B5_O_Dm                      0.879724
Downward_momentum_created    0.879724
OBV                          0.717737
Verified_status_True         0.680394
AD                          -0.704983
ADOSC                       -0.942292
Name: O, dtype: float64
In [1184]:
# Features whose correlation with Conscientiousness ('C') exceeds 0.5 in
# absolute value.
col_corr = df.corr()['C']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(strong_feats), strong_feats))
There are 16 strongly correlated values with conscientiousness:
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999997
Volume                       0.985550
B5_A_Dm                      0.879724
B5_E_Dm                      0.879724
B5_C_Dm                      0.879724
B5_O_Dm                      0.879724
Downward_momentum_created    0.879724
OBV                          0.717737
Verified_status_True         0.680394
AD                          -0.704983
ADOSC                       -0.942292
Name: C, dtype: float64
In [1185]:
# Features whose correlation with Extraversion ('E') exceeds 0.5 in
# absolute value.
df_corr = df.corr()['E']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Bug fix: the label previously said "conscientiousness" (copy-pasted from
# the 'C' cell); it now names the trait this cell actually analyses.
print("There are {} strongly correlated values with Extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with conscientiousness:
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999997
Volume                       0.985550
B5_A_Dm                      0.879724
B5_E_Dm                      0.879724
B5_C_Dm                      0.879724
B5_O_Dm                      0.879724
Downward_momentum_created    0.879724
OBV                          0.717737
Verified_status_True         0.680394
AD                          -0.704983
ADOSC                       -0.942292
Name: E, dtype: float64
In [1186]:
# Features whose correlation with Agreeableness ('A') exceeds 0.5 in
# absolute value.
df_corr = df.corr()['A']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Bug fix: the label previously said "conscientiousness" (copy-pasted from
# the 'C' cell); it now names the trait this cell actually analyses.
print("There are {} strongly correlated values with Agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with conscientiousness:
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999997
Volume                       0.985550
B5_A_Dm                      0.879724
B5_E_Dm                      0.879724
B5_C_Dm                      0.879724
B5_O_Dm                      0.879724
Downward_momentum_created    0.879724
OBV                          0.717737
Verified_status_True         0.680394
AD                          -0.704983
ADOSC                       -0.942292
Name: A, dtype: float64
In [1187]:
# Features whose correlation with Neuroticism ('N') exceeds 0.5 in
# absolute value.
df_corr = df.corr()['N']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Bug fix: the label previously said "conscientiousness" (copy-pasted from
# the 'C' cell); it now names the trait this cell actually analyses.
print("There are {} strongly correlated values with Neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: N, dtype: float64)
In [1188]:
# List all columns before running the remaining per-feature correlation scans.
df.columns
Out[1188]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Return',
       'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Fake_news', 'returns', 'log_returns', 'vol_current',
       'vol_future'],
      dtype='object')
In [1189]:
# Features whose correlation with 'B5_O_Um' exceeds 0.5 in absolute value.
col_corr = df.corr()['B5_O_Um']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(strong_feats), strong_feats))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [1190]:
# Features whose correlation with 'B5_C_Um' exceeds 0.5 in absolute value.
col_corr = df.corr()['B5_C_Um']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(strong_feats), strong_feats))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [1191]:
# Features whose correlation with 'B5_E_Um' exceeds 0.5 in absolute value.
col_corr = df.corr()['B5_E_Um']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(strong_feats), strong_feats))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [1192]:
# Features whose correlation with 'B5_A_Um' exceeds 0.5 in absolute value.
col_corr = df.corr()['B5_A_Um']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(strong_feats), strong_feats))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [1193]:
# Features whose correlation with 'B5_N_Um' exceeds 0.5 in absolute value.
col_corr = df.corr()['B5_N_Um']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(strong_feats), strong_feats))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [1194]:
# Features whose correlation with 'B5_O_Dm' exceeds 0.5 in absolute value.
col_corr = df.corr()['B5_O_Dm']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(strong_feats), strong_feats))
There are 16 strongly correlated values with B5_O_Dm:
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.879724
A                            0.879724
E                            0.879724
C                            0.879724
O                            0.879724
Verified_status_False        0.879714
Volume                       0.810023
Verified_status_True         0.600907
OBV                          0.570596
AD                          -0.548004
ADOSC                       -0.768444
Name: B5_O_Dm, dtype: float64
In [1195]:
# Features whose correlation with 'B5_C_Dm' exceeds 0.5 in absolute value.
col_corr = df.corr()['B5_C_Dm']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(strong_feats), strong_feats))
There are 16 strongly correlated values with B5_C_Dm:
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.879724
A                            0.879724
E                            0.879724
C                            0.879724
O                            0.879724
Verified_status_False        0.879714
Volume                       0.810023
Verified_status_True         0.600907
OBV                          0.570596
AD                          -0.548004
ADOSC                       -0.768444
Name: B5_C_Dm, dtype: float64
In [1196]:
# Features whose correlation with 'B5_E_Dm' exceeds 0.5 in absolute value.
col_corr = df.corr()['B5_E_Dm']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(strong_feats), strong_feats))
There are 16 strongly correlated values with B5_E_Dm:
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.879724
A                            0.879724
E                            0.879724
C                            0.879724
O                            0.879724
Verified_status_False        0.879714
Volume                       0.810023
Verified_status_True         0.600907
OBV                          0.570596
AD                          -0.548004
ADOSC                       -0.768444
Name: B5_E_Dm, dtype: float64
In [1197]:
# Features whose correlation with 'B5_A_Dm' exceeds 0.5 in absolute value.
col_corr = df.corr()['B5_A_Dm']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(strong_feats), strong_feats))
There are 16 strongly correlated values with B5_A_Dm:
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.879724
A                            0.879724
E                            0.879724
C                            0.879724
O                            0.879724
Verified_status_False        0.879714
Volume                       0.810023
Verified_status_True         0.600907
OBV                          0.570596
AD                          -0.548004
ADOSC                       -0.768444
Name: B5_A_Dm, dtype: float64
In [1198]:
# Features whose correlation with 'B5_N_Dm' exceeds 0.5 in absolute value.
col_corr = df.corr()['B5_N_Dm']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(strong_feats), strong_feats))
There are 0 strongly correlated values with B5_N_Dm:
Series([], Name: B5_N_Dm, dtype: float64)
In [1199]:
# Features whose correlation with 'Fake_news' exceeds 0.5 in absolute value.
col_corr = df.corr()['Fake_news']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Real_or_Fake_tweet :\n{}".format(len(strong_feats), strong_feats))
There are 16 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999997
Volume                       0.985550
B5_A_Dm                      0.879724
B5_E_Dm                      0.879724
B5_C_Dm                      0.879724
B5_O_Dm                      0.879724
Downward_momentum_created    0.879724
OBV                          0.717737
Verified_status_True         0.680394
AD                          -0.704983
ADOSC                       -0.942292
Name: Fake_news, dtype: float64
In [1200]:
# Features whose correlation with 'Downward_momentum_created' exceeds 0.5
# in absolute value.
col_corr = df.corr()['Downward_momentum_created']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(strong_feats), strong_feats))
There are 16 strongly correlated values with Downward_momentum_created :
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.879724
A                            0.879724
E                            0.879724
C                            0.879724
O                            0.879724
Verified_status_False        0.879714
Volume                       0.810023
Verified_status_True         0.600907
OBV                          0.570596
AD                          -0.548004
ADOSC                       -0.768444
Name: Downward_momentum_created, dtype: float64
In [1201]:
# Features whose correlation with 'Upward_momentum_created' exceeds 0.5
# in absolute value.
col_corr = df.corr()['Upward_momentum_created']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(strong_feats), strong_feats))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [1202]:
# Features whose correlation with 'Verified_status_True' exceeds 0.5
# in absolute value.
col_corr = df.corr()['Verified_status_True']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(strong_feats), strong_feats))
There are 15 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
Fake_news                    0.680394
A                            0.680394
E                            0.680394
C                            0.680394
O                            0.680394
Verified_status_False        0.678596
Volume                       0.662111
B5_A_Dm                      0.600907
B5_E_Dm                      0.600907
B5_C_Dm                      0.600907
B5_O_Dm                      0.600907
Downward_momentum_created    0.600907
OBV                          0.502148
ADOSC                       -0.626257
Name: Verified_status_True, dtype: float64
In [1203]:
# Features whose correlation with 'Verified_status_False' exceeds 0.5
# in absolute value.
col_corr = df.corr()['Verified_status_False']
strong_feats = col_corr[col_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(strong_feats), strong_feats))
There are 16 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999997
A                            0.999997
E                            0.999997
C                            0.999997
O                            0.999997
Volume                       0.985575
B5_A_Dm                      0.879714
B5_E_Dm                      0.879714
B5_C_Dm                      0.879714
B5_O_Dm                      0.879714
Downward_momentum_created    0.879714
OBV                          0.717688
Verified_status_True         0.678596
AD                          -0.705024
ADOSC                       -0.942339
Name: Verified_status_False, dtype: float64
In [1204]:
# Shrink seaborn fonts so the many small pairplots below stay readable.
sns.set(font_scale=0.8)
In [1205]:
# Scatter every feature against NATR, five features per pairplot row.
for start in range(0, len(df.columns), 5):
    feature_chunk = df.columns[start:start + 5]
    sns.pairplot(data=df, x_vars=feature_chunk, y_vars=['NATR'])
In [1206]:
# Inspect column dtypes to confirm the frame loaded with the expected types.
df.dtypes
Out[1206]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1207]:
# Count missing values per column before imputing/dropping below.
df.isnull().sum()
Out[1207]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1208]:
# Replace any remaining NaNs with 0. Plain assignment instead of
# inplace=True: same effect on `df`, but idiomatic and re-run friendly.
df = df.fillna(0)
In [1209]:
# NOTE(review): the previous cell already filled every NaN with 0 (and the
# earlier isnull().sum() showed none), so this dropna appears to be a no-op —
# consider deleting it.
df.dropna(inplace=True)
In [1210]:
# Re-apply the smaller seaborn font scale for the annotated heatmap below.
sns.set(font_scale=0.8)
In [1211]:
# Correlation heatmap ('Close' excluded) showing only the strong cells.
# NOTE(review): the positive (>= 0.5) and negative (<= -0.4) cut-offs are
# asymmetric — confirm this is intentional.
corr_wo_close = df.drop('Close', axis=1).corr()
plt.figure(figsize=(12, 10))

strong_mask = (corr_wo_close >= 0.5) | (corr_wo_close <= -0.4)
sns.heatmap(corr_wo_close[strong_mask],
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);
In [1212]:
# Summary statistics for every numeric column.
df.describe()
Out[1212]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Fake_news returns log_returns vol_current vol_future
count 468.000000 468.000000 468.000000 468.000000 468.000000 4.680000e+02 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 4.680000e+02 468.000000 4.680000e+02 4.680000e+02 4.680000e+02 468.0 4.680000e+02 468.0 468.0 468.0 468.0 468.0 4.680000e+02 4.680000e+02 4.680000e+02 4.680000e+02 468.0 468.000000 468.000000 468.000000 468.000000 468.000000 468.000000 468.0 468.000000 468.000000 468.000000 468.000000 468.000000
mean 6.231966 6.520321 5.948376 6.202714 6.202714 1.332378e+05 0.131768 0.365316 0.164022 0.566142 6.859814 5.604118 6.231966 -0.085735 9.252811 0.607607 35.090860 -0.074624 -0.071995 -0.002629 -0.107265 -1.739377 -0.009020 46.893518 -0.158106 44.877125 4.310474e+01 42.879384 -1.283095e+06 -6.147572e+04 -6.876575e+05 0.0 7.617847e+04 0.0 0.0 0.0 0.0 0.0 7.617847e+04 7.617847e+04 7.617847e+04 7.617847e+04 0.0 0.010684 7.119658 7.130342 7.130342 7.130342 7.130342 0.0 7.130342 -0.079631 -0.002864 0.061655 0.061143
std 1.794156 1.878131 1.719451 1.797703 1.797703 2.029760e+06 6.319855 0.582516 0.368758 0.245625 2.037902 1.585632 1.752478 0.503958 3.376426 0.479898 23.661300 0.296449 0.264274 0.120852 1.113222 7.543412 0.164918 10.856567 0.363290 11.172344 2.326234e+01 21.445443 2.573037e+06 5.640195e+05 2.510702e+06 0.0 3.108380e+05 0.0 0.0 0.0 0.0 0.0 3.108380e+05 3.108380e+05 3.108380e+05 3.108380e+05 0.0 0.138411 41.387998 41.482048 41.482048 41.482048 41.482048 0.0 41.482048 6.502832 0.064116 0.018733 0.018475
min 1.920000 2.120000 1.890000 2.000000 2.000000 0.000000e+00 -22.454810 -1.430488 0.002127 0.180364 2.348566 1.772203 2.170000 -1.883205 3.604446 0.000000 0.023735 -0.807535 -0.764861 -0.402666 -4.600000 -20.646403 -0.388732 19.054480 -0.906122 14.360203 -3.419487e-14 0.249688 -4.166858e+07 -1.158566e+07 -4.915700e+06 0.0 8.000000e+00 0.0 0.0 0.0 0.0 0.0 8.000000e+00 8.000000e+00 8.000000e+00 8.000000e+00 0.0 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.0 1.000000 -24.326841 -0.278747 0.030493 0.030493
25% 5.380000 5.707500 4.915000 5.217500 5.217500 4.100000e+03 -3.603113 0.056009 0.024620 0.390013 5.914356 4.573033 5.180000 -0.373622 6.828162 0.300000 16.237545 -0.246629 -0.226526 -0.058241 -0.660000 -6.416996 -0.107648 39.413086 -0.352046 36.420265 2.472975e+01 25.372604 -6.931572e+05 -1.920576e+04 -2.575250e+05 0.0 4.545756e+03 0.0 0.0 0.0 0.0 0.0 4.545756e+03 4.545756e+03 4.545756e+03 4.545756e+03 0.0 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.0 1.000000 -3.867786 -0.039446 0.046764 0.046764
50% 6.485000 6.820000 6.215000 6.440000 6.440000 1.165000e+04 0.000000 0.294892 0.059216 0.541241 7.207210 5.847653 6.562143 -0.050833 8.547092 0.485000 30.381869 -0.068980 -0.067819 0.007540 -0.100000 -1.048386 -0.018608 47.019483 -0.065763 44.485368 4.017587e+01 41.351320 -4.137615e+05 -4.793660e+03 -4.480000e+04 0.0 1.588800e+04 0.0 0.0 0.0 0.0 0.0 1.588800e+04 1.588800e+04 1.588800e+04 1.588800e+04 0.0 0.000000 3.000000 3.000000 3.000000 3.000000 3.000000 0.0 3.000000 -0.159047 -0.001592 0.055347 0.054839
75% 7.152500 7.462500 6.862500 7.180000 7.180000 3.142500e+04 3.202341 0.576092 0.139392 0.700684 7.731716 6.583447 7.118571 0.201298 10.837856 0.762500 51.761298 0.089954 0.077966 0.053542 0.352500 2.951392 0.058926 54.426653 0.058427 53.009114 6.053474e+01 58.794200 -3.199878e+05 2.768219e+03 1.052750e+05 0.0 4.214803e+04 0.0 0.0 0.0 0.0 0.0 4.214803e+04 4.214803e+04 4.214803e+04 4.214803e+04 0.0 0.000000 5.250000 5.250000 5.250000 5.250000 5.250000 0.0 5.250000 3.118547 0.030709 0.077417 0.075861
max 12.400000 12.400000 11.400000 11.990000 11.990000 4.369460e+07 35.515155 5.013547 3.338192 1.446178 13.872647 10.852795 11.575714 1.629295 26.819372 3.380000 91.369298 0.774456 0.632903 0.497846 4.110000 24.001020 0.703371 74.652083 0.509668 72.848986 9.591839e+01 93.880952 1.069983e+05 1.472361e+05 3.910620e+07 0.0 5.158965e+06 0.0 0.0 0.0 0.0 0.0 5.158965e+06 5.158965e+06 5.158965e+06 5.158965e+06 0.0 2.000000 874.000000 876.000000 876.000000 876.000000 876.000000 0.0 876.000000 35.515155 0.303913 0.101381 0.101381
In [1213]:
# DROPPING ALL NaN VALUES
# NOTE(review): NaNs were already filled with 0 and dropped in earlier cells,
# so this is almost certainly a no-op — consider deleting.
df.dropna(inplace=True)
In [1214]:
# Number of most-recent days shown in the zoomed-in bottom panel.
n_zoom = 365
sns.set_context("talk", font_scale=1.3)
# plt.style.use(['seaborn'])

# Visualize realized current vs. future volatility: full history on top,
# the last n_zoom days below.
# NOTE(review): n_future and INTERVAL_WINDOW come from earlier cells — the
# plot depends on those globals being defined.
with sns.axes_style("whitegrid"):
    fig, (ax_full, ax_zoom) = plt.subplots(nrows=2, ncols=1, figsize=(18,14))

    ax_full.plot(df.vol_current, alpha=.8, lw=1, color='gray', ls=':',
                 label='Current Volatility')
    ax_full.plot(df.vol_future, lw=1, color='blue',
                 label=f'Next {n_future} Days Volatility (TARGET)')

    ax_zoom.plot(df.vol_current[-n_zoom:], alpha=.8, lw=2, color='gray', ls=':',
                 label='Current Volatility')
    ax_zoom.plot(df.vol_future[-n_zoom:], lw=2, color='blue',
                 label=f'Next {n_future} Days Volatility (TARGET)')

    ax_full.title.set_text(f'Future vs. Current Daily Volatility \n Using {INTERVAL_WINDOW}-Day Interval')
    ax_zoom.title.set_text(f'Zooming in the Last {n_zoom} Days')

    for panel in (ax_full, ax_zoom):
        panel.legend(loc='upper left', prop={'size': 13}, frameon=True)
    plt.tight_layout()

    plt.show()

Daily Volatility Distribution

In [1215]:
# Histogram of current daily volatility with a fitted normal curve overlaid.
with sns.axes_style("darkgrid"):
    fig, dist_ax = plt.subplots(figsize=(10, 6))
    sns.distplot(df.vol_current, norm_hist=True, fit=stats.norm, bins=50,
                 ax=dist_ax)
    dist_ax.set_title('Daily Volatility Distribution')

    plt.show()

Experiment 2: weekly granularity

In [1218]:
# Ticker selector for the weekly-granularity experiment (same choices as
# the earlier dropdown).
ticker_choices = ['SELECT', 'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
                  'FB', 'GME', 'MCD', 'PFE', 'PLUG',
                  'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU']

w = widgets.Dropdown(
    options=ticker_choices,
    value='SELECT',
    description='Stock name:',
)

def on_change(change):
    """Echo the newly selected ticker whenever the dropdown value changes."""
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

w.observe(on_change)

display(w)
You have selected ARDS
In [1219]:
# Load the preprocessed dataset for the selected ticker. Every branch of the
# original 15-way if-chain read '/content/Final_<TICKER>.csv' with identical
# options, so a single parameterized read is equivalent. When the dropdown is
# still on 'SELECT', `df` is intentionally left untouched (same as before).
if w.value != 'SELECT':
    df = pd.read_csv(f'/content/Final_{w.value}.csv',
                     parse_dates=['Date'], index_col=['Date'])
In [1220]:
# Inspect the column names of the freshly loaded dataset.
df.columns
Out[1220]:
Index(['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Return', 'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Real_or_Fake_tweet'],
      dtype='object')
In [1221]:
# (rows, columns) of the loaded dataset.
df.shape
Out[1221]:
(513, 52)
In [1222]:
# Count missing values per column before imputation.
df.isnull().sum()
Out[1222]:
Unnamed: 0                    0
Open                          0
High                          0
Low                           0
Close                         0
Adj Close                     0
Volume                        0
Return                        0
Beta                          0
Variance                      0
AvgTrueRange                  0
Upperband                     0
Lowerband                     0
Middleband                    0
APO                           2
NATR                          0
TRANGE                        0
DMI                           0
MACD                         10
MACDSIGNAL                   10
MACDHIST                     10
MOM                           0
PPO                           2
ROCP                          0
RSI                           0
TRIX                         45
ULTOSC                        5
SLOWK                         0
SLOWD                         0
AD                            0
ADOSC                         0
OBV                           0
Upward_momentum_created       0
Downward_momentum_created     0
B5_O_Um                       0
B5_C_Um                       0
B5_E_Um                       0
B5_A_Um                       0
B5_N_Um                       0
B5_O_Dm                       0
B5_C_Dm                       0
B5_E_Dm                       0
B5_A_Dm                       0
B5_N_Dm                       0
Verified_status_True          0
Verified_status_False         0
O                             0
C                             0
E                             0
A                             0
N                             0
Real_or_Fake_tweet            0
dtype: int64
In [1223]:
# Impute remaining NaNs with per-column medians, drop the CSV index artifact
# column, and shorten the label column's name.
df = df.fillna(df.median())
df = df.drop(columns=['Unnamed: 0'])
df = df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
In [1224]:
# Aggregate the daily rows into weekly means.
df_weekly = df.resample('W').mean()
In [1225]:
# Shape after weekly resampling.
df_weekly.shape
Out[1225]:
(143, 51)
In [1226]:
# Annotated correlation heatmap over all weekly features.
fig = plt.figure(figsize=(40, 15))
sns.heatmap(df_weekly.corr(), annot=True)
Out[1226]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f077d44fa50>
In [1227]:
# Shrink the seaborn font scale for the dense grids of plots below.
sns.set(font_scale=0.8)
In [1228]:
# Per-column histograms of the weekly data, 50 bins each.
df_weekly.hist(figsize=(20, 32), bins=50, xlabelsize=8, ylabelsize=8);
In [1229]:
# Features with |corr| > 0.5 against AvgTrueRange, strongest-positive first.
corr_with_target = df_weekly.corr()['AvgTrueRange']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
TRANGE          0.715597
NATR            0.669201
Upperband       0.660413
High            0.618157
Open            0.581811
Adj Close       0.570510
Close           0.570510
Middleband      0.566156
Variance        0.560593
TRIX            0.533475
Low             0.524425
Name: AvgTrueRange, dtype: float64
In [1230]:
# Features with |corr| > 0.5 against NATR, strongest-positive first.
corr_with_target = df_weekly.corr()['NATR']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 3 strongly correlated values with NATR :
NATR            1.000000
AvgTrueRange    0.669201
TRANGE          0.543999
Name: NATR, dtype: float64
In [1231]:
# Features with |corr| > 0.5 against TRANGE, strongest-positive first.
corr_with_target = df_weekly.corr()['TRANGE']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(golden_features_list), golden_features_list))
There are 4 strongly correlated values with TRANGE:
TRANGE          1.000000
AvgTrueRange    0.715597
Variance        0.567594
NATR            0.543999
Name: TRANGE, dtype: float64
In [1232]:
# Features with |corr| > 0.5 against Openness ('O'), strongest-positive first.
corr_with_target = df_weekly.corr()['O']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Openness:
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999999
Volume                       0.992420
B5_A_Dm                      0.952715
B5_E_Dm                      0.952715
B5_C_Dm                      0.952715
B5_O_Dm                      0.952715
Downward_momentum_created    0.952715
Verified_status_True         0.890375
OBV                          0.692092
AD                          -0.737314
ADOSC                       -0.956932
Name: O, dtype: float64
In [1233]:
# Features with |corr| > 0.5 against Conscientiousness ('C'),
# strongest-positive first.
corr_with_target = df_weekly.corr()['C']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with conscientiousness:
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999999
Volume                       0.992420
B5_A_Dm                      0.952715
B5_E_Dm                      0.952715
B5_C_Dm                      0.952715
B5_O_Dm                      0.952715
Downward_momentum_created    0.952715
Verified_status_True         0.890375
OBV                          0.692092
AD                          -0.737314
ADOSC                       -0.956932
Name: C, dtype: float64
In [1234]:
# Features with |corr| > 0.5 against Extraversion ('E').
# Fix: the copy-pasted print label said "conscientiousness" although the
# target column here is 'E' (Extraversion).
df_corr = df_weekly.corr()['E']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with conscientiousness:
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999999
Volume                       0.992420
B5_A_Dm                      0.952715
B5_E_Dm                      0.952715
B5_C_Dm                      0.952715
B5_O_Dm                      0.952715
Downward_momentum_created    0.952715
Verified_status_True         0.890375
OBV                          0.692092
AD                          -0.737314
ADOSC                       -0.956932
Name: E, dtype: float64
In [1235]:
# Features with |corr| > 0.5 against Agreeableness ('A').
# Fix: the copy-pasted print label said "conscientiousness" although the
# target column here is 'A' (Agreeableness).
df_corr = df_weekly.corr()['A']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with conscientiousness:
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999999
Volume                       0.992420
B5_A_Dm                      0.952715
B5_E_Dm                      0.952715
B5_C_Dm                      0.952715
B5_O_Dm                      0.952715
Downward_momentum_created    0.952715
Verified_status_True         0.890375
OBV                          0.692092
AD                          -0.737314
ADOSC                       -0.956932
Name: A, dtype: float64
In [1236]:
# Features with |corr| > 0.5 against Neuroticism ('N').
# Fix: the copy-pasted print label said "conscientiousness" although the
# target column here is 'N' (Neuroticism).
df_corr = df_weekly.corr()['N']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: N, dtype: float64)
In [1237]:
# Features with |corr| > 0.5 against B5_O_Um, strongest-positive first.
corr_with_target = df_weekly.corr()['B5_O_Um']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [1238]:
# Features with |corr| > 0.5 against B5_C_Um, strongest-positive first.
corr_with_target = df_weekly.corr()['B5_C_Um']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [1239]:
# Features with |corr| > 0.5 against B5_E_Um, strongest-positive first.
corr_with_target = df_weekly.corr()['B5_E_Um']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [1240]:
# Features with |corr| > 0.5 against B5_A_Um, strongest-positive first.
corr_with_target = df_weekly.corr()['B5_A_Um']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [1241]:
# Features with |corr| > 0.5 against B5_N_Um, strongest-positive first.
corr_with_target = df_weekly.corr()['B5_N_Um']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [1242]:
# Features with |corr| > 0.5 against B5_O_Dm, strongest-positive first.
corr_with_target = df_weekly.corr()['B5_O_Dm']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with B5_O_Dm:
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_False        0.952720
Fake_news                    0.952715
A                            0.952715
E                            0.952715
C                            0.952715
O                            0.952715
Volume                       0.914958
Verified_status_True         0.846011
OBV                          0.662821
AD                          -0.648112
ADOSC                       -0.874548
Name: B5_O_Dm, dtype: float64
In [1243]:
# Features with |corr| > 0.5 against B5_C_Dm, strongest-positive first.
corr_with_target = df_weekly.corr()['B5_C_Dm']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with B5_C_Dm:
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_False        0.952720
Fake_news                    0.952715
A                            0.952715
E                            0.952715
C                            0.952715
O                            0.952715
Volume                       0.914958
Verified_status_True         0.846011
OBV                          0.662821
AD                          -0.648112
ADOSC                       -0.874548
Name: B5_C_Dm, dtype: float64
In [1244]:
# Features with |corr| > 0.5 against B5_E_Dm, strongest-positive first.
corr_with_target = df_weekly.corr()['B5_E_Dm']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with B5_E_Dm:
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_False        0.952720
Fake_news                    0.952715
A                            0.952715
E                            0.952715
C                            0.952715
O                            0.952715
Volume                       0.914958
Verified_status_True         0.846011
OBV                          0.662821
AD                          -0.648112
ADOSC                       -0.874548
Name: B5_E_Dm, dtype: float64
In [1245]:
# Features with |corr| > 0.5 against B5_A_Dm, strongest-positive first.
corr_with_target = df_weekly.corr()['B5_A_Dm']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with B5_A_Dm:
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_False        0.952720
Fake_news                    0.952715
A                            0.952715
E                            0.952715
C                            0.952715
O                            0.952715
Volume                       0.914958
Verified_status_True         0.846011
OBV                          0.662821
AD                          -0.648112
ADOSC                       -0.874548
Name: B5_A_Dm, dtype: float64
In [1246]:
# Features with |corr| > 0.5 against B5_N_Dm, strongest-positive first.
corr_with_target = df_weekly.corr()['B5_N_Dm']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_N_Dm:
Series([], Name: B5_N_Dm, dtype: float64)
In [1247]:
# Features with |corr| > 0.5 against the fake-news label.
# Fix: the print label still used the pre-rename column name
# 'Real_or_Fake_tweet'; the column was renamed to 'Fake_news' earlier.
df_corr = df_weekly.corr()['Fake_news']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Fake_news :\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999999
Volume                       0.992420
B5_A_Dm                      0.952715
B5_E_Dm                      0.952715
B5_C_Dm                      0.952715
B5_O_Dm                      0.952715
Downward_momentum_created    0.952715
Verified_status_True         0.890375
OBV                          0.692092
AD                          -0.737314
ADOSC                       -0.956932
Name: Fake_news, dtype: float64
In [1248]:
# Features with |corr| > 0.5 against Downward_momentum_created.
corr_with_target = df_weekly.corr()['Downward_momentum_created']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Downward_momentum_created :
B5_A_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_False        0.952720
Fake_news                    0.952715
A                            0.952715
E                            0.952715
C                            0.952715
O                            0.952715
Volume                       0.914958
Verified_status_True         0.846011
OBV                          0.662821
AD                          -0.648112
ADOSC                       -0.874548
Name: Downward_momentum_created, dtype: float64
In [1249]:
# Features with |corr| > 0.5 against Upward_momentum_created.
corr_with_target = df_weekly.corr()['Upward_momentum_created']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [1250]:
# Features with |corr| > 0.5 against Verified_status_True.
corr_with_target = df_weekly.corr()['Verified_status_True']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
Fake_news                    0.890375
A                            0.890375
E                            0.890375
C                            0.890375
O                            0.890375
Verified_status_False        0.889848
Volume                       0.885258
B5_A_Dm                      0.846011
B5_E_Dm                      0.846011
B5_C_Dm                      0.846011
B5_O_Dm                      0.846011
Downward_momentum_created    0.846011
OBV                          0.628015
AD                          -0.649531
ADOSC                       -0.846103
Name: Verified_status_True, dtype: float64
In [1251]:
# Features with |corr| > 0.5 against Verified_status_False.
corr_with_target = df_weekly.corr()['Verified_status_False']
golden_features_list = corr_with_target[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999999
A                            0.999999
E                            0.999999
C                            0.999999
O                            0.999999
Volume                       0.992415
B5_A_Dm                      0.952720
B5_E_Dm                      0.952720
B5_C_Dm                      0.952720
B5_O_Dm                      0.952720
Downward_momentum_created    0.952720
Verified_status_True         0.889848
OBV                          0.692061
AD                          -0.737331
ADOSC                       -0.956947
Name: Verified_status_False, dtype: float64
In [1252]:
sns.set(font_scale=0.8)
In [1253]:
# Scatter each feature (5 per figure) against NATR to eyeball relationships.
for start in range(0, len(df_weekly.columns), 5):
    feature_batch = df_weekly.columns[start:start + 5]
    sns.pairplot(data=df_weekly,
                 x_vars=feature_batch,
                 y_vars=['NATR'])
In [1254]:
# Replace any NaNs introduced by the weekly resampling with zero.
df_weekly = df_weekly.fillna(0)
In [1255]:
# Drop rows with remaining NaNs — effectively a no-op here, since the
# previous cell already filled every NaN with 0; kept as a safeguard.
df_weekly.dropna(inplace=True)
In [1256]:
# Correlation heatmap (Close excluded); only cells with corr >= 0.5 or
# corr <= -0.4 are drawn, which masks out the weak correlations.
corr = df_weekly.drop('Close', axis=1).corr()
plt.figure(figsize=(12, 10))

strong_mask = (corr >= 0.5) | (corr <= -0.4)
sns.heatmap(corr[strong_mask],
            cmap='YlGnBu', vmin=-1.0, vmax=1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

Weekly Volatility Distribution

In [1257]:
# Histogram of weekly NATR with a fitted normal curve overlaid.
with sns.axes_style("darkgrid"):
    fig, axis = plt.subplots(figsize=(10, 6))
    sns.distplot(df_weekly.NATR, ax=axis, bins=50,
                 norm_hist=True, fit=stats.norm)
    axis.set_title('Weekly Volatility Distribution')
    plt.show()